Unemployment in India¶

In [6]:
 #import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import calendar
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
In [7]:
df = pd.read_csv("Unemployment in india.csv")
df.head()
Out[7]:
Region Date Frequency Estimated Unemployment Rate (%) Estimated Employed Estimated Labour Participation Rate (%) Area
0 Andhra Pradesh 31-05-2019 Monthly 3.65 11999139.0 43.24 Rural
1 Andhra Pradesh 30-06-2019 Monthly 3.05 11755881.0 42.05 Rural
2 Andhra Pradesh 31-07-2019 Monthly 3.75 12086707.0 43.50 Rural
3 Andhra Pradesh 31-08-2019 Monthly 3.32 12285693.0 43.97 Rural
4 Andhra Pradesh 30-09-2019 Monthly 5.17 12256762.0 44.68 Rural
In [8]:
 df.tail()
Out[8]:
Region Date Frequency Estimated Unemployment Rate (%) Estimated Employed Estimated Labour Participation Rate (%) Area
763 NaN NaN NaN NaN NaN NaN NaN
764 NaN NaN NaN NaN NaN NaN NaN
765 NaN NaN NaN NaN NaN NaN NaN
766 NaN NaN NaN NaN NaN NaN NaN
767 NaN NaN NaN NaN NaN NaN NaN
In [9]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 7 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Region                                    740 non-null    object 
 1    Date                                     740 non-null    object 
 2    Frequency                                740 non-null    object 
 3    Estimated Unemployment Rate (%)          740 non-null    float64
 4    Estimated Employed                       740 non-null    float64
 5    Estimated Labour Participation Rate (%)  740 non-null    float64
 6   Area                                      740 non-null    object 
dtypes: float64(3), object(4)
memory usage: 42.1+ KB
In [10]:
df.columns = ['region','date','frequency','estimated unemployment rate','estimated employed','estimated labour participation rate','area']

df.head()
Out[10]:
region date frequency estimated unemployment rate estimated employed estimated labour participation rate area
0 Andhra Pradesh 31-05-2019 Monthly 3.65 11999139.0 43.24 Rural
1 Andhra Pradesh 30-06-2019 Monthly 3.05 11755881.0 42.05 Rural
2 Andhra Pradesh 31-07-2019 Monthly 3.75 12086707.0 43.50 Rural
3 Andhra Pradesh 31-08-2019 Monthly 3.32 12285693.0 43.97 Rural
4 Andhra Pradesh 30-09-2019 Monthly 5.17 12256762.0 44.68 Rural
In [11]:
df.shape
Out[11]:
(768, 7)
In [12]:
df.columns
Out[12]:
Index(['region', 'date', 'frequency', 'estimated unemployment rate',
       'estimated employed', 'estimated labour participation rate', 'area'],
      dtype='object')
In [13]:
df.describe()
Out[13]:
estimated unemployment rate estimated employed estimated labour participation rate
count 740.000000 7.400000e+02 740.000000
mean 11.787946 7.204460e+06 42.630122
std 10.721298 8.087988e+06 8.111094
min 0.000000 4.942000e+04 13.330000
25% 4.657500 1.190404e+06 38.062500
50% 8.350000 4.744178e+06 41.160000
75% 15.887500 1.127549e+07 45.505000
max 76.740000 4.577751e+07 72.570000
In [14]:
df.isnull().sum()
Out[14]:
region                                 28
date                                   28
frequency                              28
estimated unemployment rate            28
estimated employed                     28
estimated labour participation rate    28
area                                   28
dtype: int64
In [15]:
df.duplicated().any()
Out[15]:
True
In [16]:
df.area.value_counts()
Out[16]:
area
Urban    381
Rural    359
Name: count, dtype: int64
In [17]:
df['Date'] = pd.to_datetime(df['date'],dayfirst = True)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 8 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   region                               740 non-null    object        
 1   date                                 740 non-null    object        
 2   frequency                            740 non-null    object        
 3   estimated unemployment rate          740 non-null    float64       
 4   estimated employed                   740 non-null    float64       
 5   estimated labour participation rate  740 non-null    float64       
 6   area                                 740 non-null    object        
 7   Date                                 740 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 48.1+ KB
In [18]:
df['month_int'] = df['Date'].dt.month
df.head()
Out[18]:
region date frequency estimated unemployment rate estimated employed estimated labour participation rate area Date month_int
0 Andhra Pradesh 31-05-2019 Monthly 3.65 11999139.0 43.24 Rural 2019-05-31 5.0
1 Andhra Pradesh 30-06-2019 Monthly 3.05 11755881.0 42.05 Rural 2019-06-30 6.0
2 Andhra Pradesh 31-07-2019 Monthly 3.75 12086707.0 43.50 Rural 2019-07-31 7.0
3 Andhra Pradesh 31-08-2019 Monthly 3.32 12285693.0 43.97 Rural 2019-08-31 8.0
4 Andhra Pradesh 30-09-2019 Monthly 5.17 12256762.0 44.68 Rural 2019-09-30 9.0
In [19]:
df = df.dropna()
df['month'] = df['month_int'].astype(int).apply(lambda x: calendar.month_abbr[x])
df.head()
Out[19]:
region date frequency estimated unemployment rate estimated employed estimated labour participation rate area Date month_int month
0 Andhra Pradesh 31-05-2019 Monthly 3.65 11999139.0 43.24 Rural 2019-05-31 5.0 May
1 Andhra Pradesh 30-06-2019 Monthly 3.05 11755881.0 42.05 Rural 2019-06-30 6.0 Jun
2 Andhra Pradesh 31-07-2019 Monthly 3.75 12086707.0 43.50 Rural 2019-07-31 7.0 Jul
3 Andhra Pradesh 31-08-2019 Monthly 3.32 12285693.0 43.97 Rural 2019-08-31 8.0 Aug
4 Andhra Pradesh 30-09-2019 Monthly 5.17 12256762.0 44.68 Rural 2019-09-30 9.0 Sep
In [20]:
data = df.groupby(['month'])[['estimated unemployment rate','estimated employed','estimated labour participation rate']].mean()
data=pd.DataFrame(data).reset_index()
In [21]:
month = data.month
unemployment_rate = data['estimated unemployment rate']
labour_participation_rate = data['estimated labour participation rate']
fig = go.Figure()
fig.add_trace(go.Bar(x = month,y = unemployment_rate,name = 'Unemployment Rate'))
fig.add_trace(go.Bar(x = month,y = labour_participation_rate,name = 'Labour Participation Rate'))
fig.update_layout(title = 'Unemployment Rate and Labour Participation',xaxis = {'categoryorder':'array','categoryarray':['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct']} )
In [22]:
fig.show()
In [23]:
import plotly.express as px
In [24]:
fig = px.bar(data,x='month',y='estimated employed',color='month',category_orders ={'month':['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct']},
title='Estimated employed people from Jan 2020 to Oct 2020')
fig.show()
In [25]:
area = df.groupby(['area'])[['estimated unemployment rate','estimated employed','estimated labour participation rate']].mean()
area = pd.DataFrame(area).reset_index()
In [26]:
# Box plot

fig = px.box(data_frame=df,x='area',y='estimated unemployment rate',color='area',title='Unemployment rate')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()
In [27]:
# average unemployment rate bar plot

fig = px.bar(area,x='area',y='estimated unemployment rate',color='area',title='Average unemployment rate (area)')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()
In [28]:
 # Bar plot Unemployment Rate (monthly)

fig = px.bar(df,x='area',y='estimated unemployment rate',animation_frame='month',color='area',title='Unemployment rate from Jan 2020 to Oct 2020(State)')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()
In [29]:
# Filter data before and during lockdown

before_lockdown = df[df['Date'] < '2020-03-25']
In [30]:
during_lockdown = df[df['Date'] >= '2020-03-25']

# Average Unemployment Rate before and during lockdown

avg_unemployment_before = before_lockdown['estimated unemployment rate'].mean()
avg_unemployment_during = during_lockdown['estimated unemployment rate'].mean()
print(f"Average Unemployment Rate before lockdown: {avg_unemployment_before:.2f}%")
print(f"Average Unemployment Rate during lockdown: {avg_unemployment_during:.2f}%")

# Percentage change in Unemployment Rate

percentage_change = ((avg_unemployment_during - avg_unemployment_before) /avg_unemployment_before) * 100
print(f"Percentage Change in Unemployment Rate: {percentage_change:.2f}%")
Average Unemployment Rate before lockdown: 9.51%
Average Unemployment Rate during lockdown: 17.77%
Percentage Change in Unemployment Rate: 86.91%
In [31]:
fig=px.scatter_geo(df,'region',color='region',hover_name='region',size='estimated unemployment rate',animation_frame='month',scope='asia',title='Impact of lockdown on employment in India')
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] =2000
fig.update_geos(lataxis_range=[5,40],lonaxis_range=[65,100],oceancolor='lightblue',showocean=True)
fig.show()
In [32]:
 df.region.unique()
Out[32]:
array(['Andhra Pradesh', 'Assam', 'Bihar', 'Chhattisgarh', 'Delhi', 'Goa',
       'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jammu & Kashmir',
       'Jharkhand', 'Karnataka', 'Kerala', 'Madhya Pradesh',
       'Maharashtra', 'Meghalaya', 'Odisha', 'Puducherry', 'Punjab',
       'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura',
       'Uttar Pradesh', 'Uttarakhand', 'West Bengal', 'Chandigarh'],
      dtype=object)
In [33]:
# numeric data grouped by region

region = df.groupby(['region'])[['estimated unemployment rate','estimated employed','estimated labour participation rate']].mean()
region = pd.DataFrame(region).reset_index()
In [34]:
import plotly.express as px

# Specify dimensions and color parameter for the scatter matrix plot
dimensions = ['estimated unemployment rate', 'estimated employed', 'estimated labour participation rate']
color_column = 'region'

# Create scatter matrix plot with Plotly Express
fig = px.scatter_matrix(df,dimensions=dimensions,color=color_column,title='Scatter Matrix Plot Colored by Region')

# Display the plot
fig.show()
In [47]:
import dash
import dash_core_components as dcc
import dash_html_components as html

app = dash.Dash(__name__)

# Define layout
app.layout = html.Div([
dcc.Graph(id='unemployment-trend', figure={'data': [{'x': df['Date'], 'y': df['estimated unemployment rate'],'type': 'line', 'name': 'Unemployment Rate'}],'layout': {'title': 'Unemployment Rate Over Time'}})])

if __name__ == '__main__':
    app.run_server(debug=True)
In [36]:
# Average Unemployment Rate

fig = px.bar(region,x='region',y='estimated unemployment rate',color='region',title='Average unemployment rate(region)')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()
In [37]:
fig = px.bar(df,x='region',y='estimated unemployment rate',animation_frame='month',color='area',title='Unemployment rate from Jan 2020 to Oct 2020')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] =2000
fig.show()
In [38]:
unemployment =df.groupby(['region','area'])['estimated unemployment rate'].mean().reset_index()
unemployment.head()
Out[38]:
region area estimated unemployment rate
0 Andhra Pradesh Rural 5.526429
1 Andhra Pradesh Urban 9.427857
2 Assam Rural 4.490833
3 Assam Urban 8.088571
4 Bihar Rural 16.770000
In [39]:
fig = px.sunburst(unemployment,path=['region','area'],values='estimated unemployment rate',title ='Unemployment rate in area and region',height=600)
fig.show()
In [40]:
# data representation before and after lockdown

before_lockdown = df[(df['month_int']>=1) &(df['month_int'] <4)]
after_lockdown = df[(df['month_int'] >=4) & (df['month_int'] <=6)]
In [48]:
af_lockdown = after_lockdown.groupby('area')['estimated unemployment rate'].mean().reset_index()
lockdown = before_lockdown.groupby('area')['estimated unemployment rate'].mean().reset_index()
lockdown['unemployment rate before lockdown'] = af_lockdown['estimated unemployment rate']
In [42]:
lockdown.columns = ['area','unemployment rate before lockdown','unemployment rate after lockdown']
lockdown.head()
Out[42]:
area unemployment rate before lockdown unemployment rate after lockdown
0 Rural 8.735132 13.909843
1 Urban 11.561951 17.177293
In [43]:
# unenployment rate change after lockdown

lockdown['rate change in unemployment'] =round(lockdown['unemployment rate before lockdown']-lockdown['unemployment rate before lockdown']
/lockdown['unemployment rate after lockdown'],2)
In [44]:
fig = px.bar(lockdown,x='area',y='rate change in unemployment',color='rate change in unemployment',title='Percentage change in Unemployment rate in each state after lockdown',template='ggplot2')
fig.update_layout(xaxis={'categoryorder':'total ascending'})
fig.show()
In [45]:
# Calculate correlation matrix
correlation_matrix = df[['estimated unemployment rate', 'estimated employed','estimated labour participation rate']].corr()

# Plot correlation heatmap
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()
No description has been provided for this image
In [46]:
from statsmodels.tsa.arima.model import ARIMA

# Fit ARIMA model
model = ARIMA(df['estimated unemployment rate'], order=(1, 1, 1))
model_fit = model.fit()

# Forecast future unemployment rates
forecast = model_fit.forecast(steps=12)
print(forecast)
C:\Users\yrath\AppData\Local\Programs\Python\Python312\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

An unsupported index was provided and will be ignored when e.g. forecasting.

C:\Users\yrath\AppData\Local\Programs\Python\Python312\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

An unsupported index was provided and will be ignored when e.g. forecasting.

C:\Users\yrath\AppData\Local\Programs\Python\Python312\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

An unsupported index was provided and will be ignored when e.g. forecasting.

740    10.456696
741    10.723949
742    10.843648
743    10.897259
744    10.921271
745    10.932025
746    10.936842
747    10.939000
748    10.939966
749    10.940399
750    10.940593
751    10.940679
Name: predicted_mean, dtype: float64
C:\Users\yrath\AppData\Local\Programs\Python\Python312\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning:

No supported index is available. Prediction results will be given with an integer index beginning at `start`.

In [ ]: